library(tantale)
library(ggplot2)
library(tidyverse)
library(Biostrings)
library(parallel)
library(magrittr)
library(DT)
library(gplots)
library(ape)
library(ggtree)

This vignette will go through all steps in a talomes analysis pipeline from Xoo genome sequences using functions in package tantale.
Genomes used for this vignette are MAI1, BAI3, and BAI3-1-1 (an error-prone genome with TalC deleted).

Get the fasta files:

mai1_fa <- system.file("extdata", "MAI1.fa", package = "tantale", mustWork = T)
bai3_fa <- system.file("extdata", "BAI3.fa", package = "tantale", mustWork = T)
bai311_fa <- system.file("extdata", "BAI3-1-1.fa", package = "tantale", mustWork = T)
pxo86 <- system.file("extdata", "PXO86.fa", package = "tantale", mustWork = T)

Output dir to save all the results of this vignette:

outdir <- fs::dir_create("~/TEMP/test_tantale") # tempdir(check = TRUE)
telltale2_outdir <- file.path(outdir, "telltale2")
dir.create(telltale2_outdir, showWarnings = T)

1 find TALs

We will run telltale2() without correction for “gold” genomes, MAI1 and BAI3, and telltale2() with correction for “error-prone” genome BAI3-1-1.
By default, telltale2() adds NTERM and CTERM to RVD sequences as markers of the termini, that is important to map the repeat codes by DisTal with the RVDs.

1.1 run telltale2 no correction

mclapply(c(mai1_fa, bai3_fa, pxo86), function(fastaFile) {
  outdirname <- gsub(".fa", "_nocorrection", basename(fastaFile))
  outputDir <- file.path(telltale2_outdir, outdirname)
  tellTale(subjectFile = fastaFile,
            outputDir = outputDir,
            appendExtremityCodes = TRUE,
            rvdSep = "-",
            talArrayCorrection = FALSE)
}, mc.cores = 2) 
#>> [[1]]
#>> [1] "/home/cunnac/TEMP/test_tantale/telltale2/MAI1_nocorrection"
#>> 
#>> [[2]]
#>> [1] "/home/cunnac/TEMP/test_tantale/telltale2/BAI3_nocorrection"
#>> 
#>> [[3]]
#>> [1] "/home/cunnac/TEMP/test_tantale/telltale2/PXO86_nocorrection"

1.2 run telltale2 correction

tellTale(subjectFile = bai311_fa,
         outputDir = file.path(telltale2_outdir, gsub(".fa", "_correction", basename(bai311_fa))),
         TALE_CtermDNAHitMinScore = 300,
         appendExtremityCodes = TRUE,
         rvdSep = "-",
         talArrayCorrection = TRUE)
#>> # HMMER 3.3 (Nov 2019); http://hmmer.org/
#>> # Copyright (C) 2019 Howard Hughes Medical Institute.
#>> Finding the closest reference amino acid sequences:
#>> ================================================================================
#>> 
#>> Time difference of 0.55 secs
#>> Assessing frameshifts in nucleotide sequences:
#>> ================================================================================
#>> 
#>> Time difference of 400.62 secs
#>> ##  Now running annoTALE analyze for putativeTalOrf using the following command:
#>> ##   java -jar /home/cunnac/R/x86_64-pc-linux-gnu-library/4.2/tantale/tools/AnnoTALEcli-1.4.1.jar analyze  t=/home/cunnac/TEMP/test_tantale/telltale2/BAI3-1-1_correction/annotale/ROI_00001/putativeTalOrf.fasta outdir=/home/cunnac/TEMP/test_tantale/telltale2/BAI3-1-1_correction/annotale/ROI_00001 
#>> ##  Now running annoTALE analyze for putativeTalOrf using the following command:
#>> ##   java -jar /home/cunnac/R/x86_64-pc-linux-gnu-library/4.2/tantale/tools/AnnoTALEcli-1.4.1.jar analyze  t=/home/cunnac/TEMP/test_tantale/telltale2/BAI3-1-1_correction/annotale/ROI_00002/putativeTalOrf.fasta outdir=/home/cunnac/TEMP/test_tantale/telltale2/BAI3-1-1_correction/annotale/ROI_00002 
#>> ##  Now running annoTALE analyze for putativeTalOrf using the following command:
#>> ##   java -jar /home/cunnac/R/x86_64-pc-linux-gnu-library/4.2/tantale/tools/AnnoTALEcli-1.4.1.jar analyze  t=/home/cunnac/TEMP/test_tantale/telltale2/BAI3-1-1_correction/annotale/ROI_00003/putativeTalOrf.fasta outdir=/home/cunnac/TEMP/test_tantale/telltale2/BAI3-1-1_correction/annotale/ROI_00003 
#>> ##  Now running annoTALE analyze for putativeTalOrf using the following command:
#>> ##   java -jar /home/cunnac/R/x86_64-pc-linux-gnu-library/4.2/tantale/tools/AnnoTALEcli-1.4.1.jar analyze  t=/home/cunnac/TEMP/test_tantale/telltale2/BAI3-1-1_correction/annotale/ROI_00005/putativeTalOrf.fasta outdir=/home/cunnac/TEMP/test_tantale/telltale2/BAI3-1-1_correction/annotale/ROI_00005 
#>> ##  Now running annoTALE analyze for putativeTalOrf using the following command:
#>> ##   java -jar /home/cunnac/R/x86_64-pc-linux-gnu-library/4.2/tantale/tools/AnnoTALEcli-1.4.1.jar analyze  t=/home/cunnac/TEMP/test_tantale/telltale2/BAI3-1-1_correction/annotale/ROI_00006/putativeTalOrf.fasta outdir=/home/cunnac/TEMP/test_tantale/telltale2/BAI3-1-1_correction/annotale/ROI_00006 
#>> ##  Now running annoTALE analyze for putativeTalOrf using the following command:
#>> ##   java -jar /home/cunnac/R/x86_64-pc-linux-gnu-library/4.2/tantale/tools/AnnoTALEcli-1.4.1.jar analyze  t=/home/cunnac/TEMP/test_tantale/telltale2/BAI3-1-1_correction/annotale/ROI_00007/putativeTalOrf.fasta outdir=/home/cunnac/TEMP/test_tantale/telltale2/BAI3-1-1_correction/annotale/ROI_00007 
#>> ##  Now running annoTALE analyze for putativeTalOrf using the following command:
#>> ##   java -jar /home/cunnac/R/x86_64-pc-linux-gnu-library/4.2/tantale/tools/AnnoTALEcli-1.4.1.jar analyze  t=/home/cunnac/TEMP/test_tantale/telltale2/BAI3-1-1_correction/annotale/ROI_00008/putativeTalOrf.fasta outdir=/home/cunnac/TEMP/test_tantale/telltale2/BAI3-1-1_correction/annotale/ROI_00008 
#>> ##  Now running annoTALE analyze for putativeTalOrf using the following command:
#>> ##   java -jar /home/cunnac/R/x86_64-pc-linux-gnu-library/4.2/tantale/tools/AnnoTALEcli-1.4.1.jar analyze  t=/home/cunnac/TEMP/test_tantale/telltale2/BAI3-1-1_correction/annotale/ROI_00009/putativeTalOrf.fasta outdir=/home/cunnac/TEMP/test_tantale/telltale2/BAI3-1-1_correction/annotale/ROI_00009
#>> #****************************************
#>> #**   tellTale analysis done     **
#>> Current date:   Tue Aug 29 14:16:33 2023
#>> #_________Provided I/O parameters __________
#>> File of subject DNA sequences:  /tmp/RtmpaJIkJj/file8c86a17eba4ab
#>> TALE N-term CDS region detection HMM file:  /home/cunnac/R/x86_64-pc-linux-gnu-library/4.2/tantale/extdata/hmmProfile/Xo_TALE_Nterm_CDS_profile.hmm
#>> TALE repeat unit CDS detection HMM file:    /home/cunnac/R/x86_64-pc-linux-gnu-library/4.2/tantale/extdata/hmmProfile/Xo_TALE_repeat_CDS_profile.hmm
#>> TALE C-term CDS region detection HMM file:  /home/cunnac/R/x86_64-pc-linux-gnu-library/4.2/tantale/extdata/hmmProfile/Xo_TALE_Cterm_CDS_profile.hmm
#>> Output directory:   /home/cunnac/TEMP/test_tantale/telltale2/BAI3-1-1_correction
#>> #____________Other parameters________________
#>> TALE_NtermDNAHitMinScore    :   300
#>> repeatDNAHitMinScore    :   20
#>> TALE_CtermDNAHitMinScore    :   300
#>> minDomainHitsPerSubjSeq :   4
#>> mergeHits   :   TRUE
#>> minGapWidth :   35
#>> extendedLength  :   300
#>> talArrayCorrection  :   TRUE
#>> refForTalArrayCorrection    :   /home/cunnac/R/x86_64-pc-linux-gnu-library/4.2/tantale/extdata/decipher_ref_tales_aa.fa.gz
#>> frameShiftCorrection    :   -11
#>> #__________Summary measures of TALE search outcome__________
#>> Number of analysed subject sequences :  2
#>> Total number of TALE repeat DNA coding sequence motif hits found with the nhmmer approach:  155
#>> Total number of subject seqs with TALE motif hits after low hit number filtering:   1
#>> Total number of distinct regions (repeat arrays) with adjacent TALE motifs :    9
#>> Total number of 'complete' arrays (with both N- and C-term flanking motifs):    8
#>> Minimum array length (number of TALE domain hits):  1
#>> Maximum array length:   28
#>> Median array length:    20
#>> Number of gaps of size below 500nt between TALE motifs arrays:  1.5
#>> First quartile of size of gaps (below 500nt) between TALE motifs arrays:    108
#>> Median size of gaps (below 500nt) between TALE motifs arrays:   108
#>> Upper quartile of size of gaps (below 500nt) between TALE motifs arrays:    108
#>> #__________Noteworthy AnnoTale issues__________
#>> # 
#>> #*************************

1.3 overview of telltale2 results

Besides RVD sequences and Tal ORFs, other useful outputs of telltale2() are the reports, from which we can get information about the structure of Tal sequences, corrections have been made, and so on.

The first we should take a look is “arrayReport.tsv”.Wwe will take the files from 3 result folders, gather them, and see which information we can display from it.

# color set for ploting
sunset10 <- c("#ba6b57", "#87556f", "#848ccf", "#93b5e1", "#6f4a8e", "#a2738c", "#ab93c9", "#588da8", "#726a95", "#bc658d")

arrayReport_files <- list.files(telltale2_outdir, "arrayReport.tsv", recursive = T, full.names = T)

arrayRp <- tibble()
for (f in arrayReport_files) {
  a <- read_tsv(f)
  if (!any(c("predicted_ins_count", "predicted_dels_count") %in% colnames(a)))
    # 2 columns "predicted_ins_count" and "predicted_dels_count" are not displayed in the report in case of no correction
    {a[c("predicted_ins_count", "predicted_dels_count")] <- NA} 
  a$strain <- gsub("\\_.+", "", basename(dirname(f)))
  arrayRp <- rbind(arrayRp, a)
}
#>> Rows: 10 Columns: 15
#>> ── Column specification ────────────────────────────────────────────────────────
#>> Delimiter: "\t"
#>> chr (6): arrayID, OriginalSubjectName, Strand, ArraySeq, SeqOfRVD, LongestOR...
#>> dbl (7): Start, End, NumberOfHits, N.terminusAAlength, C.terminusAAlength, L...
#>> lgl (2): AllDomains, aberrantRepeat
#>> 
#>> ℹ Use `spec()` to retrieve the full column specification for this data.
#>> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#>> Rows: 9 Columns: 17
#>> ── Column specification ────────────────────────────────────────────────────────
#>> Delimiter: "\t"
#>> chr (6): arrayID, OriginalSubjectName, Strand, ArraySeq, SeqOfRVD, LongestOR...
#>> dbl (9): Start, End, NumberOfHits, predicted_ins_count, predicted_dels_count...
#>> lgl (2): AllDomains, aberrantRepeat
#>> 
#>> ℹ Use `spec()` to retrieve the full column specification for this data.
#>> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#>> Rows: 10 Columns: 15
#>> ── Column specification ────────────────────────────────────────────────────────
#>> Delimiter: "\t"
#>> chr (6): arrayID, OriginalSubjectName, Strand, ArraySeq, SeqOfRVD, LongestOR...
#>> dbl (7): Start, End, NumberOfHits, N.terminusAAlength, C.terminusAAlength, L...
#>> lgl (2): AllDomains, aberrantRepeat
#>> 
#>> ℹ Use `spec()` to retrieve the full column specification for this data.
#>> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#>> Rows: 19 Columns: 15
#>> ── Column specification ────────────────────────────────────────────────────────
#>> Delimiter: "\t"
#>> chr (6): arrayID, OriginalSubjectName, Strand, ArraySeq, SeqOfRVD, LongestOR...
#>> dbl (7): Start, End, NumberOfHits, N.terminusAAlength, C.terminusAAlength, L...
#>> lgl (2): AllDomains, aberrantRepeat
#>> 
#>> ℹ Use `spec()` to retrieve the full column specification for this data.
#>> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
arrayRp <- as_tibble(arrayRp)

This report takes result of HMMer for Tals motif detection, result of correction and AnnoTALE analyze.
What information it contains:

colnames(arrayRp)
#>>  [1] "arrayID"               "OriginalSubjectName"   "Start"                
#>>  [4] "End"                   "Strand"                "NumberOfHits"         
#>>  [7] "ArraySeq"              "AllDomains"            "SeqOfRVD"             
#>> [10] "aberrantRepeat"        "N.terminusAAlength"    "C.terminusAAlength"   
#>> [13] "LongestOrfLength"      "OrfCovOverArrayLength" "LongestORFSeq"        
#>> [16] "predicted_ins_count"   "predicted_dels_count"  "strain"

Some important points:

Column name Content
AllDomains TRUE/FALSE, whether all the 3 domains of Tal arrays are detected
aberrantRepeat TRUE/FALSE, whether there is any aberrant repeat
predicted_ins_count number of insertions corrected by telltale2()
predicted_dels_count number of deletions corrected by telltale2()

1.3.1 count AllDomains and aberrantRepeat

ggplot(arrayRp) +
  geom_bar(aes(x = strain, fill = AllDomains, color = aberrantRepeat), stat = "count") +
  scale_fill_manual(values = c("grey", sunset10[4])) +
  scale_color_manual(values = c("black", "red"), na.value = NA, labels = c("FALSE", "TRUE", "AnnoTALE_failed")) +
  guides(color = guide_legend(override.aes = list(fill = sunset10[4])))

1.3.2 count corrections

total correction count per genome (in this case, there is only 1 genome that has been corrected)

arrayRp_correction <- reshape2::melt(arrayRp[c("strain", "arrayID", "predicted_ins_count", "predicted_dels_count")], value.name = "count", variable.name = "type")
#>> Using strain, arrayID as id variables
arrayRp_correction %<>% dplyr::arrange(strain, arrayID)

ggplot(arrayRp_correction) +
  facet_wrap(vars(type), nrow = 2, scales = "fixed") +
  geom_bar(aes(x = strain, fill = factor(count))) +
  scale_fill_viridis_d() +
  guides(fill = guide_legend(title = "correction count"))

correction count per array per genome

ggplot(na.omit(arrayRp_correction), aes(x = arrayID, y = count, color = type, shape = type)) +
  facet_wrap(vars(strain), nrow = 2) +
  scale_shape_manual(values = c(20, 2)) +
  geom_point(size = 4) +
  scale_color_manual(values = c(sunset10[4], "black"))+
  theme(axis.text.x = element_text(angle = 90),
        legend.direction = "horizontal", legend.position = "bottom")

correction distribution per genome

ggplot(na.omit(arrayRp_correction), aes(x = strain, y = count)) +
  geom_dotplot(aes(fill = type, color = type), binaxis = "y", stackdir = "center", position = "dodge", binwidth = 1/10) +
  scale_fill_manual(values = c(sunset10[4], "black")) +
  scale_color_manual(values = c(sunset10[4], "black")) +
  theme(legend.direction = "horizontal", legend.position = "bottom",
        panel.grid.minor.y = element_blank())

1.3.3 count domain codon

Another report that we can reveal is “hitsReport.tsv”, which contains results of HMMer.

hitsReport_files <- dir(telltale2_outdir, "hitsReport.tsv", recursive = T, full.names = T)

hitsReport <- data.frame()
for (f in hitsReport_files) {
  a <- read.table(f, sep = "\t", stringsAsFactors = F, header = T)
  a$strain <- gsub("\\_.+", "", basename(dirname(f)))
  hitsReport <- rbind(hitsReport, a)
}
hitsReport$query_name <- factor(hitsReport$query_name, levels = c("TALE_N-terminus_CDS_aligned_curated", "TALE_repeat_CDS_aligned_curated", "TALE_C-terminus_CDS_aligned_curated_long"), ordered = F)

information in the hitsReport

colnames(hitsReport)
#>>  [1] "arrayID"          "seqnames"         "start"            "end"             
#>>  [5] "width"            "strand"           "nhmmerHitID"      "query_name"      
#>>  [9] "hitID"            "seq"              "codon_count"      "frameshift_count"
#>> [13] "strain"

We could see the length distribution of each Tal protein domain (query_name) predicted by HMMer.

ggplot(hitsReport) +
  facet_wrap(vars(query_name), scales = "free", nrow = 1) +
  geom_bar(aes(x = factor(codon_count), fill = strain), stat = "count", position = position_dodge(preserve = "single")) +
  scale_fill_manual(values = sunset10) +
  scale_y_sqrt(breaks = c(0, 1, 5, 10, 15, 100, 200, 350))

It seems that BAI3-1-1 has 1 Tal truncated at C-terminus caused by artifactual indels. But this is just the “raw” result of Tal motif detection.

Now we have a look at “domainsReport.tsv”, a similar report as “hitsReport.tsv” except that it contains the information of Tal domains predicted by AnnoTALE after correction, , so that we can make comparision and infer the impact of correction in the protein sequences.

domainsReport_file <- dir(telltale2_outdir, "domainsReport.tsv", recursive = T, full.names = T)

domainsReport <- data.frame()
for (f in domainsReport_file) {
  a <- read.table(f, sep = "\t", stringsAsFactors = F, header = T)
  a$strain <- gsub("\\_.+", "", basename(dirname(f)))
  domainsReport <- rbind(domainsReport, a)
}

domainsReport$query_name <- factor(domainsReport$query_name, levels = c("N-terminus", "repeat", "C-terminus"))
domainsReport$codon_count <- as.factor(domainsReport$codon_count)

ggplot(domainsReport) +
  facet_wrap(vars(query_name), scales = "free", nrow = 1) +
  geom_bar(aes(x = codon_count, fill = strain), stat = "count", position = position_dodge(preserve = "single")) +
  scale_fill_manual(values = sunset10) +
  scale_y_sqrt(breaks = c(0, 1, 5, 10, 15, 100, 200, 350))

Firstly, the domain detection of AnnoTALE is different from HMMer at some amino acids.
Secondly, the truncated C-terminus of BAI3-1-1 (84 aa) has been corrected and has the same length as its counterparts.

2 classify TAL groups

To classify Tal groups, we use function runDistal() to run the tool DisTal that outputs similarity between Tals and similarity between repeats, then the Tals similarity can be taken as input of groupTales() for clustering groups, and the repeats similarity can be used for plotting multiple alignment of Tals within group.

2.1 run DisTal

We run DisTal for all the putative TAL ORFs (from telltale2 output).

distal_telltale <- file.path(outdir, "distal")
unlink(distal_telltale, recursive = TRUE)
dir.create(distal_telltale, showWarnings = F)
telltale2_cds <- file.path(distal_telltale, "cds.fasta")
unlink(telltale2_cds)


distal_telltale_input <- list.files(telltale2_outdir,
                                    "putativeTalOrf.fasta",
                                    recursive = T, full.names = T)
distal_telltale_input <- distal_telltale_input[!grepl("ROI", distal_telltale_input, fixed = T)]
for (input in distal_telltale_input) {
  ttRunId <- gsub("\\_.+", "", basename(dirname(input)))
  seqs <- readBStringSet(input, seek.first.rec = T)
  names(seqs) <- paste0(ttRunId, "_", names(seqs))
  seqs <- seqs[width(seqs) != 0]
  #cat(names(seqs))
  writeXStringSet(seqs, telltale2_cds, append = T)
}

# run DisTal
distal_output <- runDistal(telltale2_cds, outdir = distal_telltale, overwrite = T)

# distal_output$tal.similarity %>%as_tibble() %>% filter(TAL1 == | TAL2 == )

2.2 run the R implementation of DisTal

distalrOutDir <- file.path(outdir, "distalr")
unlink(distalrOutDir, recursive = TRUE)
dir.create(distalrOutDir, showWarnings = F)


taleParts <- lapply(list.dirs(telltale2_outdir, recursive = FALSE), function(ttGenomeDir) {
  genomeId <- gsub("_.*correction", "", basename(ttGenomeDir))
  logger::log_debug("Current genome: {genomeId}")
  taleParts <- getTaleParts(ttGenomeDir) %>%
    mutate(arrayID = paste0(genomeId, "_", arrayID))
  return(taleParts)
}) %>%
  bind_rows()


partsForPlots <- taleParts %>%
  mutate(label = if_else(domainType == "repeat", rvd, ""),
         aaSeqLength = factor(nchar(aaSeq))
  )
partsForPlots %>% ggplot(mapping = aes(fill = aaSeqLength,
                                       color = domainType,
                                       label = label,
                                       y = arrayID,
                                       x = positionInArray),
                         color = isNaAaSeq) +
  scale_color_viridis_d(option = "rocket") +
  scale_fill_discrete() +
  scale_x_continuous(breaks = 1:50, minor_breaks = NULL) +
  geom_point(shape = 21, size = 5, stroke = 0.9) +
  ggnewscale::new_scale_color() +
  ggnewscale::new_scale_fill() +
  geom_text(size = 2.1, color = "white") + 
  facet_grid(seqnames~ ., scales = "free_y", space = "free") +
  theme_light()




partsForDistalr <- taleParts
# distalr_mms_output <- distalr(taleParts = partsForDistalr,
#                               repeats.cluster.h.cut = 10, ncores = 6,
#                               pairwiseAlnMethod = "mmseq2",
#                               condaBinPath = "/home/cunnac/bin/miniconda3/condabin/conda")

distalr_deci_output <- distalr(taleParts = partsForDistalr,
                              repeats.cluster.h.cut = 10, ncores = 6,
                              pairwiseAlnMethod = "DECIPHER")

distalr_bs_output <- distalr(taleParts = partsForDistalr,
                             repeats.cluster.h.cut = 10, ncores = 6,
                             pairwiseAlnMethod = "Biostrings")

2.3 Contrast the output of the various distal functions

library('corrr')
library(ggcorrplot)
talSimOutAgregated <- list(distalALV = distal_output$tal.similarity,
     distalDECI = distalr_deci_output$tal.similarity,
     distalBIOS = distalr_bs_output$tal.similarity
) %>% bind_rows(.id = "method")

talSimOutAgregatedLong <- reshape2::dcast(talSimOutAgregated %>% select(method, TAL1, TAL2, Sim),
                                          formula = method ~ TAL1 + TAL2, value.var = "Sim"
                                            )
rownames(talSimOutAgregatedLong) <- talSimOutAgregatedLong$method
talSimOutAgregatedLong$method <- NULL
talSimOutAgregatedLong %<>% as.matrix() %>% t()
rownames(talSimOutAgregatedLong) <- NULL

corr_matrix <- cor(talSimOutAgregatedLong)
knitr::kable(corr_matrix)
distalALV distalBIOS distalDECI
distalALV 1.0000000 0.9533716 0.9547991
distalBIOS 0.9533716 1.0000000 0.9961920
distalDECI 0.9547991 0.9961920 1.0000000
ggcorrplot(corr_matrix)



talSimOutAgregated <- reshape2::dcast(talSimOutAgregated %>% select(method, TAL1, TAL2, Sim),
                                          formula = TAL1 + TAL2 ~ method, value.var = "Sim"
                                            ) %>% as_tibble()
talSimOutAgregated %>%
  select(-distalBIOS) %>%
  mutate(pointLab = paste(TAL1, TAL2, sep = "|")) %>%
  ggplot(mapping = aes(x = distalALV, distalDECI)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1)

k <- 26
groupsALV <- invisible(groupTales(taleSim = distal_output$tal.similarity,
                     plotTree = TRUE, k = k, k_test = 10:40,
                     method = "k-medoids"))
#>> tale tree will not be plotted!
#>> Number of groups is decided based on the provided value of k: 26


groupsDECI <- invisible(groupTales(taleSim = distalr_deci_output$tal.similarity,
                     plotTree = TRUE, k = k, k_test = 10:40,
                     method = "k-medoids"))
#>> tale tree will not be plotted!
#>> Number of groups is decided based on the provided value of k: 26


groupsBIOS <- invisible(groupTales(taleSim = distalr_bs_output$tal.similarity,
                     plotTree = TRUE, k = k, k_test = 10:40,
                     method = "k-medoids"))
#>> tale tree will not be plotted!
#>> Number of groups is decided based on the provided value of k: 26



groupTales(taleSim = distal_output$tal.similarity,
                     plotTree = TRUE, k = k,
                     method = "hclust")

#>>                  name group
#>> 1      BAI3_ROI_00001     1
#>> 2      BAI3_ROI_00002     2
#>> 3      BAI3_ROI_00003     3
#>> 4      BAI3_ROI_00004     4
#>> 5      BAI3_ROI_00006     5
#>> 6      BAI3_ROI_00007     6
#>> 7      BAI3_ROI_00008     7
#>> 8      BAI3_ROI_00009     8
#>> 9      BAI3_ROI_00010     9
#>> 10 BAI3-1-1_ROI_00001     2
#>> 11 BAI3-1-1_ROI_00002     3
#>> 12 BAI3-1-1_ROI_00003     4
#>> 13 BAI3-1-1_ROI_00005     5
#>> 14 BAI3-1-1_ROI_00006     6
#>> 15 BAI3-1-1_ROI_00007     7
#>> 16 BAI3-1-1_ROI_00008     8
#>> 17 BAI3-1-1_ROI_00009     9
#>> 18     MAI1_ROI_00001     1
#>> 19     MAI1_ROI_00002     2
#>> 20     MAI1_ROI_00003    10
#>> 21     MAI1_ROI_00004     4
#>> 22     MAI1_ROI_00006     5
#>> 23     MAI1_ROI_00007     6
#>> 24     MAI1_ROI_00008     7
#>> 25     MAI1_ROI_00009     8
#>> 26     MAI1_ROI_00010     9
#>> 27    PXO86_ROI_00001    11
#>> 28    PXO86_ROI_00002    12
#>> 29    PXO86_ROI_00003    13
#>> 30    PXO86_ROI_00004    14
#>> 31    PXO86_ROI_00006    15
#>> 32    PXO86_ROI_00007    16
#>> 33    PXO86_ROI_00008    17
#>> 34    PXO86_ROI_00009    18
#>> 35    PXO86_ROI_00010    19
#>> 36    PXO86_ROI_00011    20
#>> 37    PXO86_ROI_00012    21
#>> 38    PXO86_ROI_00013    22
#>> 39    PXO86_ROI_00014    23
#>> 40    PXO86_ROI_00015    23
#>> 41    PXO86_ROI_00016    24
#>> 42    PXO86_ROI_00017    25
#>> 43    PXO86_ROI_00018    26
#>> 44    PXO86_ROI_00019    11
groupTales(taleSim = distalr_deci_output$tal.similarity,
                     plotTree = TRUE, k = k,
                     method = "hclust")

#>>                  name group
#>> 1      BAI3_ROI_00001     1
#>> 2      BAI3_ROI_00002     2
#>> 3      BAI3_ROI_00003     3
#>> 4      BAI3_ROI_00004     4
#>> 5      BAI3_ROI_00006     5
#>> 6      BAI3_ROI_00007     6
#>> 7      BAI3_ROI_00008     7
#>> 8      BAI3_ROI_00009     8
#>> 9      BAI3_ROI_00010     9
#>> 10 BAI3-1-1_ROI_00001     2
#>> 11 BAI3-1-1_ROI_00002     3
#>> 12 BAI3-1-1_ROI_00003     4
#>> 13 BAI3-1-1_ROI_00005     5
#>> 14 BAI3-1-1_ROI_00006     6
#>> 15 BAI3-1-1_ROI_00007     7
#>> 16 BAI3-1-1_ROI_00008     8
#>> 17 BAI3-1-1_ROI_00009     9
#>> 18     MAI1_ROI_00001     1
#>> 19     MAI1_ROI_00002     2
#>> 20     MAI1_ROI_00003     3
#>> 21     MAI1_ROI_00004     4
#>> 22     MAI1_ROI_00006     5
#>> 23     MAI1_ROI_00007    10
#>> 24     MAI1_ROI_00008     7
#>> 25     MAI1_ROI_00009     8
#>> 26     MAI1_ROI_00010     9
#>> 27    PXO86_ROI_00001    11
#>> 28    PXO86_ROI_00002    12
#>> 29    PXO86_ROI_00003    13
#>> 30    PXO86_ROI_00004    14
#>> 31    PXO86_ROI_00006    15
#>> 32    PXO86_ROI_00007    16
#>> 33    PXO86_ROI_00008    17
#>> 34    PXO86_ROI_00009    18
#>> 35    PXO86_ROI_00010    19
#>> 36    PXO86_ROI_00011    20
#>> 37    PXO86_ROI_00012    21
#>> 38    PXO86_ROI_00013    22
#>> 39    PXO86_ROI_00014    23
#>> 40    PXO86_ROI_00015    23
#>> 41    PXO86_ROI_00016    24
#>> 42    PXO86_ROI_00017    25
#>> 43    PXO86_ROI_00018    26
#>> 44    PXO86_ROI_00019    11
groupTales(taleSim = distalr_bs_output$tal.similarity,
                     plotTree = TRUE, k = k,
                     method = "hclust")

#>>                  name group
#>> 1      BAI3_ROI_00001     1
#>> 2      BAI3_ROI_00002     2
#>> 3      BAI3_ROI_00003     3
#>> 4      BAI3_ROI_00004     4
#>> 5      BAI3_ROI_00006     5
#>> 6      BAI3_ROI_00007     6
#>> 7      BAI3_ROI_00008     7
#>> 8      BAI3_ROI_00009     8
#>> 9      BAI3_ROI_00010     9
#>> 10 BAI3-1-1_ROI_00001     2
#>> 11 BAI3-1-1_ROI_00002     3
#>> 12 BAI3-1-1_ROI_00003     4
#>> 13 BAI3-1-1_ROI_00005     5
#>> 14 BAI3-1-1_ROI_00006     6
#>> 15 BAI3-1-1_ROI_00007     7
#>> 16 BAI3-1-1_ROI_00008     8
#>> 17 BAI3-1-1_ROI_00009     9
#>> 18     MAI1_ROI_00001     1
#>> 19     MAI1_ROI_00002     2
#>> 20     MAI1_ROI_00003     3
#>> 21     MAI1_ROI_00004     4
#>> 22     MAI1_ROI_00006     5
#>> 23     MAI1_ROI_00007    10
#>> 24     MAI1_ROI_00008     7
#>> 25     MAI1_ROI_00009     8
#>> 26     MAI1_ROI_00010     9
#>> 27    PXO86_ROI_00001    11
#>> 28    PXO86_ROI_00002    12
#>> 29    PXO86_ROI_00003    13
#>> 30    PXO86_ROI_00004    14
#>> 31    PXO86_ROI_00006    15
#>> 32    PXO86_ROI_00007    16
#>> 33    PXO86_ROI_00008    17
#>> 34    PXO86_ROI_00009    18
#>> 35    PXO86_ROI_00010    19
#>> 36    PXO86_ROI_00011    20
#>> 37    PXO86_ROI_00012    21
#>> 38    PXO86_ROI_00013    22
#>> 39    PXO86_ROI_00014    23
#>> 40    PXO86_ROI_00015    23
#>> 41    PXO86_ROI_00016    24
#>> 42    PXO86_ROI_00017    25
#>> 43    PXO86_ROI_00018    26
#>> 44    PXO86_ROI_00019    11



getTALEsTree <- function(taleSim) {
  distMat <- 100 - reshape2::acast(taleSim, formula = TAL1 ~ TAL2, value.var = "Sim")
  hclust(dist(distMat, method = "euclidean"), method = "ward.D")
}

getTALEsTree(distal_output$tal.similarity) %>%
  ggtree::ggtree() +
  ggtree::layout_dendrogram() +
  ggtree::geom_tiplab(ggtree::aes(label=label),
                          angle= 90, hjust=0,
                          offset = -3,
                      
                      , color='black')


getTALEsTree(distalr_deci_output$tal.similarity) %>%
  ggtree::ggtree() +
  ggtree::layout_dendrogram() +
  ggtree::geom_tiplab(ggtree::aes(label=label),
                          angle= 90, hjust=0,
                          offset = -3,
                      
                      , color='black')


getTALEsTree(distalr_bs_output$tal.similarity) %>%
  ggtree::ggtree() +
  ggtree::layout_dendrogram() +
  ggtree::geom_tiplab(ggtree::aes(label=label),
                          angle= 90, hjust=0,
                          offset = -3,
                      
                      , color='black')

ape::cophyloplot(getTALEsTree(distal_output$tal.similarity) %>% as.phylo(),
                 getTALEsTree(distalr_deci_output$tal.similarity) %>% as.phylo(),
                 assoc = matrix(rep(distal_output$tal.similarity$TAL1 %>% unique() %>% as.character(), 2),
                                ncol = 2, byrow = FALSE),
                 length.line = 0, space=200, gap=10,
                 col = "red")

2.4 classify TAL groups

taleGroups <- groupsBIOS

Based on the silhouette values, groupTales() will take the value at the elbow of the curve. In this case it picked k = 9, and it makes sense as we already know that the each of the gold genomes have 9 Tals.

2.4.1 msa with similarity

We can make MSA of repeatID sequence or RVD sequence with the function buildRepeatMsa(), however it make more sense to align repeat than RVD. For mafft, the combo of gap opening score = 0 and gap extending score = 5 creates better alignment than default values.

repeatVectors <- distalr_deci_output$coded.repeats.str
repeatSim <- distalr_deci_output$repeat.similarity %>% select(-Dissim) #distal_output$repeat.similarity
repeatMsaByGroup_withSim <- lapply(sort(unique(taleGroups$group)), function(G) {
  repeatVectorsForGroup <- repeatVectors[taleGroups$name[taleGroups$group == G]]
  if (length(repeatVectorsForGroup) < 2) {
    msa <- as.matrix(as.data.frame(repeatVectorsForGroup))
    msa <- matrix(msa, nrow = 1)
    rownames(msa) <- colnames(as.data.frame(repeatVectorsForGroup))
    colnames(msa) <- 1:length(msa)
    return(msa)
    next()
  }
  repeatMsa <- buildRepeatMsa(inputSeqs = repeatVectorsForGroup, sep = " ",
                              distalRepeatSims = repeatSim,
                              mafftOpts = "--globalpair --weighti 1 --maxiterate 1000 --reorder --op 0 --ep 5 --thread 1")
  }
)

2.4.1.1 msa of repeat similarity

For this type of plot, we need to map repeatID to RVD with getRepeat2RvdMapping(), then convert repeat MSA to RVD MSA by convertRepeat2RvdAlign. We should use the converting function rather than using buildRepeatMsa() to creat MSA of RVD because there will be disagreement between them.

rvdSeqs_fasta <- list.files(telltale2_outdir, "rvdSequences.fas", recursive = T, full.names = T)

telltale2_rvd <- file.path(distal_telltale, "rvd.fasta")
unlink(telltale2_rvd)

# rename of RVD sequences to distinguish sequences between different strains
for (input in rvdSeqs_fasta) {
  id <- gsub("\\_.+", "", basename(dirname(input)))
  seqs <- readBStringSet(input, seek.first.rec = T)
  names(seqs) <- paste0(id, "_", names(seqs))
  names(seqs) <- stringr::str_trim(names(seqs))
  seqs <- seqs[width(seqs) != 0]
  writeXStringSet(seqs, telltale2_rvd, append = T)
}


rvdVectors <- toListOfSplitedStr(telltale2_rvd, sep = "-")
repeatVectors <- toListOfSplitedStr(distalr_deci_output$coded.repeats.str, sep = " ")

# map repeatID to RVD
rep2rvd <- getRepeat2RvdMapping(talesRepeatVectors = repeatVectors, talesRvdVectors = rvdVectors)
datatable(rep2rvd, options = list(pageLength = 5))

# convert repeat MSA to RVD MSA
repeatMSA <- repeatMsaByGroup_withSim[[2]]
rvdMSA <-  convertRepeat2RvdAlign(repeatAlign = repeatMSA, repeat2RvdMapping = rep2rvd)

# plot MSA
tantale::heatmap_msa(talsim = distalr_deci_output$tal.similarity,
                     repeatAlign = repeatMSA,
                     repeatSim = distalr_deci_output$repeat.similarity,
                     rvdAlign = rvdMSA,
                     plot.type = "repeat.similarity",
                     main = "Repeat MSA with similarity - Group 3")

2.4.1.2 msa of repeat cluster ID

tantale::heatmap_msa(talsim = distalr_deci_output$tal.similarity,
                     repeatAlign = repeatMSA,
                     repeatSim = distalr_deci_output$repeat.similarity,
                     plot.type = "repeat.clusters",
                     main = "Repeat Cluster MSA - Group 3")

2.4.1.3 msa of repeat ID with RVD labeled

For this plot, we also use the RVD MSA converted from repeat MSA.

rvdMSA <-  convertRepeat2RvdAlign(repeatAlign = repeatMSA, repeat2RvdMapping = rep2rvd)
tantale::heatmap_msa(talsim = distalr_deci_output$tal.similarity,
                     repeatAlign = repeatMSA,
                     repeatSim = distalr_deci_output$repeat.similarity,
                     rvdAlign = rvdMSA,
                     plot.type = "repeat.clusters.with.rvd",
                     main = "Repeat Cluster MSA with RVD - Group 3",
                     consensusSeq = TRUE)

3 overview of talomes

With the Tal classification result and RVD sequences, we can represent RVD sequence variants of all the strains for each Tal group as an example below.

tale_annotation <- merge(taleGroups, 
                         data.frame("name" = names(rvdVectors),
                                    "rvdseq" = sapply(rvdVectors,
                                                      function(i) paste(i, collapse = "-")
                                                      ), stringsAsFactors = FALSE
                                    )
                         )
tale_annotation$strain <- gsub("\\_.+", "", tale_annotation$name)

heatmap_talomes(tale_annotation,
                col = "group", row = "strain",
                value = "rvdseq",
                mapcol = sunset10)

4 predict target

We wrap Talvez and Preditale tools in 2 functions, talvez() and preditale(), that take the same types of RVD sequences and promoter DNA sequences as inputs and output a data frame containing predictions.

We need to concatenate all RVD sequences from 3 telltale2 outputs and remove NTERM and CTERM markers.

## rvd sequences from telltale2 output
rvdSeqs_tt <-  readBStringSet(telltale2_rvd)
rvdSeqs_tt <- gsub("\\-{0,1}\\w{5}\\-{0,1}", "",rvdSeqs_tt) %>% BStringSet() # remove NTERM and CTERM
predict_input <- file.path(outdir, "rvd_to_predict.fasta")
writeXStringSet(rvdSeqs_tt, predict_input)


## promoter sequences
rvdSeqsXstrings <-  predict_input
subjDnaSeqFile <-  system.file("extdata", "cladeIII_sweet_promoters.fasta", package = "tantale", mustWork = T)
readBStringSet(subjDnaSeqFile) %>% names
#>> [1] "SWEET11p_IR64_Sense"       "SWEET11p_BT07_Sense"      
#>> [3] "SWEET11p_93-11_Sense"      "SWEET11p_Nipponbare_Sense"
#>> [5] "SWEET13p_BT7_Sense"        "SWEET13p_IR24_Sense"      
#>> [7] "SWEET13p_Nipponbare_Sense" "SWEET14p_BT07_Sense"      
#>> [9] "SWEET14p_Nipponbare_Sense"

4.1 talvez

The output of talvez() contains the position, score and rank of each prediction.

talvezPreds <- talvez(rvdSeqs = rvdSeqsXstrings,
                      subjDnaSeqFile = subjDnaSeqFile,
                      optParam = "-t 0 -l 19",
                      condaBinPath = "/home/cunnac/bin/miniconda3/condabin/conda")
datatable(talvezPreds, options = list(pageLength = 1))

Here is an example of summarizing prediction result with heatmap.2() (scores are display by color scale, black refers no prediction):

talvezPreds$strain <- gsub("\\_.+", "", talvezPreds$taleId)
talvezPreds$group <- sapply(talvezPreds$taleId, function(id) {
  ifelse(id %in% taleGroups$name, taleGroups$group[taleGroups$name == id], NA)
}, USE.NAMES = F)

slctPromoters <- c("SWEET14p_Nipponbare_Sense", "SWEET13p_Nipponbare_Sense", "SWEET11p_Nipponbare_Sense")
selectedPreds <- talvezPreds %>%
  dplyr::filter(subjSeqId %in% slctPromoters) 

selectedPredsMat <- selectedPreds %>%
  reshape2::acast(taleId ~ subjSeqId, max, na.rm = TRUE, value.var = "score", fill = as.single(NA))

heatmap.2(selectedPredsMat,
                  col = viridis::viridis(20), #cm.colors(255),
                  sepwidth=c(0.02,0.02), sepcolor="white", colsep = 1:ncol(selectedPredsMat), rowsep = 1:nrow(selectedPredsMat),
                  trace="none",
                  margins = c(10, 15),
                  cexRow = 1,
                  cexCol = 1, srtCol = 40,
                  density.info= "histogram", key.xlab = "Pred. Score", key.title = NA,
                  lwid = c(1,5), lhei = c(1,5),
                  dendrogram = "none", Rowv = NULL, Colv = NULL,
                  na.color = "black",
                  main = "Talvez prediction"
                  )

To see how RVD sequences match a promoter region, we supply the prediction table and the promoter sequence with the genomic range to plotTaleTargetPred():

grFilter <- "SWEET14p_Nipponbare_Sense:340-450"
plotTaleTargetPred(predResults = selectedPreds, subjDnaSeqFile = subjDnaSeqFile, filterRange = grFilter)
#>> Registered S3 methods overwritten by 'ggalt':
#>>   method                  from   
#>>   grid.draw.absoluteGrob  ggplot2
#>>   grobHeight.absoluteGrob ggplot2
#>>   grobWidth.absoluteGrob  ggplot2
#>>   grobX.absoluteGrob      ggplot2
#>>   grobY.absoluteGrob      ggplot2

4.2 preditale

With Preditale, we get similar data frame output but with p value additionally. Preditale takes a little bit longer than Talvez and may give some different predictions.

preditalePreds <- preditale(rvdSeqs = rvdSeqsXstrings, subjDnaSeqFile = subjDnaSeqFile, outDir = NULL)
datatable(preditalePreds, options = list(pageLength = 1))
talvezPreds$strain <- gsub("\\_.+", "", talvezPreds$taleId)
talvezPreds$group <- sapply(talvezPreds$taleId, function(id) {
  ifelse(id %in% taleGroups$name, taleGroups$group[taleGroups$name == id], NA)
}, USE.NAMES = F)

slctPromoters <- c("SWEET14p_Nipponbare_Sense", "SWEET13p_Nipponbare_Sense", "SWEET11p_Nipponbare_Sense")
selectedPreds <- preditalePreds %>%
  dplyr::filter(subjSeqId %in% slctPromoters) 

selectedPredsMat <- selectedPreds %>%
  reshape2::acast(taleId ~ subjSeqId, max, na.rm = TRUE, value.var = "score", fill = as.single(NA))

gplots::heatmap.2(selectedPredsMat,
                  col = viridis::viridis(20), #cm.colors(255),
                  sepwidth=c(0.02,0.02), sepcolor="white", colsep = 1:ncol(selectedPredsMat), rowsep = 1:nrow(selectedPredsMat),
                  trace="none",
                  margins = c(10, 15),
                  cexRow = 1,
                  cexCol = 1, srtCol = 40,
                  density.info= "histogram", key.xlab = "Pred. Score", key.title = NA,
                  lwid = c(1,5), lhei = c(1,5),
                  dendrogram = "none", Rowv = NULL, Colv = NULL,
                  na.color = "black",
                  main = "Preditale prediction"
                  )

grFilter <- "SWEET14p_Nipponbare_Sense:340-450"
plotTaleTargetPred(predResults = selectedPreds, subjDnaSeqFile = subjDnaSeqFile, filterRange = grFilter)